In this notebook we will add TP53 alteration status as discussed in #837:
TP53 altered - loss, if :
Note: CNV and SV will be considered as the same event, but we will be adding SV_counts and SV.type to the altered status output file
TP53 altered - activated, if:
library("ggpubr")
Loading required package: ggplot2
library("ggthemes")
library("tidyverse")
── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.1 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ lubridate 1.9.2 ✔ tibble 3.2.1
✔ purrr 1.0.1 ✔ tidyr 1.3.0
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("ggfortify")
library("broom")
# rootdir
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data")
input_dir <- file.path(root_dir,
"analyses",
"tp53_nf1_score",
"input")
# cell composition
cell_line_composition <- read_tsv(file.path("..",
"molecular-subtyping-HGG",
"input",
"cell-line-composition.tsv"),
col_types =
readr::cols( aliquot_id = readr::col_character()))
# histology
if ( params$base_run ==0 ){
clinical<-read.delim(file.path(data_dir,"histologies.tsv"), stringsAsFactors = FALSE)
} else{
clinical<-read.delim(file.path(data_dir,"histologies-base.tsv"), stringsAsFactors = FALSE)
}
# recording of "BS_ETC8R0TD" GMKF NBL sample; needs to be remove once updated in v12
#clinical$RNA_library[clinical$Kids_First_Biospecimen_ID == "BS_ETC8R0TD"] <- "stranded"
histology <- clinical %>%
dplyr::select("Kids_First_Biospecimen_ID",
"sample_id",
"Kids_First_Participant_ID",
"cancer_predispositions",
"sample_type",
"experimental_strategy",
"composition",
"cohort") %>%
# merge cell line composition
left_join(cell_line_composition)
Joining with `by = join_by(Kids_First_Biospecimen_ID, sample_id,
Kids_First_Participant_ID)`
# Cancer database
hotspot_database_2017_tp53_snv <- readxl::read_xls(file.path(input_dir,"hotspots_v2.xls"),sheet = 1) %>%
filter(Hugo_Symbol == "TP53")
hotspot_database_2017_tp53_indel <- readxl::read_xls(file.path(input_dir,"hotspots_v2.xls"),sheet = 2) %>%
filter(Hugo_Symbol == "TP53")
# p53, p63 and p73 functional sites
functional_sites_tp53 <- read_tsv(file.path(input_dir,"hotspot_Chen_2006.tsv"))
Rows: 13 Columns: 3
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): p53, p63, p73
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
results_dir <- file.path(root_dir,
"analyses",
"tp53_nf1_score",
"results")
if (!dir.exists(results_dir)) {
dir.create(results_dir)
}
# Read in combined scores from tp53-nf1-classifier
classifier_score <- read_tsv(file.path(results_dir, "combined_scores.tsv")) %>%
dplyr::rename(Kids_First_Biospecimen_ID_RNA = sample_id) %>%
filter(Kids_First_Biospecimen_ID_RNA %in% histology$Kids_First_Biospecimen_ID) %>%
left_join(histology, by=c("Kids_First_Biospecimen_ID_RNA"="Kids_First_Biospecimen_ID"))
Rows: 3891 Columns: 3
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): sample_id
dbl (2): tp53_score, tp53_shuffle
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Copy number per sample
# overlapping functional domain
cnv_domain_overlap <- read_tsv(
file.path(
results_dir,
"loss_overlap_domains_tp53.tsv"))
Rows: 237 Columns: 9
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): biospecimen_id, domain, Kids_First_Participant_ID, sample_id, compo...
dbl (3): copy_number, ploidy, tp53_shuffle
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Structural variant overlapping
# or within gene locus of TP53
sv_overlap <- read_tsv(
file.path(
results_dir,
"sv_overlap_tp53.tsv"))
Rows: 94 Columns: 9
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (5): Kids.First.Biospecimen.ID.Tumor, SV_type, Gene_name, ALT, sample_id
dbl (4): SV_chrom, SV_start, SV_end, SV_length
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Structural variant as a fusion in exon1/intron1
fusion_overlap <- read_tsv(
file.path(
results_dir,
"fusion_bk_tp53_loss.tsv"))
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)
Rows: 90 Columns: 10
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): seqnames, strand, Sample, sample_id, FusionName, variable
dbl (3): start, end, width
time (1): value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
All functional sites are just 1 base so checking in
hotspot_database_2017_snv
functional_sites_tp53 %>%
filter(!gsub("[A-Z|a-z]","",p53) %in% hotspot_database_2017_tp53_snv$Amino_Acid_Position )
2 functional sites are missing in hotspots database.
Removing Silent or Intron classified variants to capture only putative damaging mutations
consensus_tp53_snv_indel <- data.table::fread(
file.path(data_dir,"snv-consensus-plus-hotspots.maf.tsv.gz"),
select = c("Chromosome",
"Start_Position",
"End_Position",
"Strand",
"Variant_Classification",
"Tumor_Sample_Barcode",
"Hugo_Symbol",
"HGVSp_Short"),
data.table = FALSE) %>%
filter(Hugo_Symbol == "TP53") %>%
filter(!(Variant_Classification %in% c("Silent", "Intron",
# remove other non-amino acid SNVs
"3'Flank" ,"5'Flank",
"3'UTR", "5'UTR" )))
hotspots : overlaps AA position which are statistically significant SNV/Indels activating : overlaps AA position which are found to act as gain-of-function according to literature
consensus_tp53_snv_indel <- consensus_tp53_snv_indel %>%
mutate(
hotspot = case_when(
# strip REF and Variant AA to get Amino_Acid_Position in consensus maf
(gsub("[A-Z|a-z]|[.]","",HGVSp_Short) %in%
# if overlaps the hotspot Amino_Acid_Position
hotspot_database_2017_tp53_snv$Amino_Acid_Position) ~ 1,
TRUE ~ 0),
activating = case_when(
# strip REF and Variant AA to get Amino_Acid_Position in consensus maf
(gsub("[A-Z|a-z]|[.]","",HGVSp_Short) %in%
# if overlaps the activating Amino_Acid_Position
c("273","248")) ~ 1,
TRUE ~ 0
)
)
We will gather values per sample_id since DNA and RNA samples can be
only matched by sample_id
tp53_alterations <- histology %>%
filter(experimental_strategy != "RNA-Seq") %>%
# filter for Tumors
filter(sample_type=="Tumor") %>%
# join consensus calls overlapping hotspots
left_join(consensus_tp53_snv_indel,
by=c("Kids_First_Biospecimen_ID"="Tumor_Sample_Barcode")) %>%
# join filtered cnv losses
left_join(cnv_domain_overlap,by=c("Kids_First_Biospecimen_ID"="biospecimen_id",
"sample_id", "Kids_First_Participant_ID",
"composition")) %>%
# join SV
left_join(sv_overlap,by=c("Kids_First_Biospecimen_ID"="Kids.First.Biospecimen.ID.Tumor",
"sample_id")) %>%
dplyr::rename(Kids_First_Biospecimen_ID_DNA = Kids_First_Biospecimen_ID) %>%
# join fusion
left_join(fusion_overlap,by="sample_id")
Warning in left_join(., cnv_domain_overlap, by = c(Kids_First_Biospecimen_ID = "biospecimen_id", : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 333 of `x` matches multiple rows in `y`.
ℹ Row 24 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
Warning in left_join(., sv_overlap, by = c(Kids_First_Biospecimen_ID = "Kids.First.Biospecimen.ID.Tumor", : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 639 of `x` matches multiple rows in `y`.
ℹ Row 29 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
Warning in left_join(., fusion_overlap, by = "sample_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 5 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
tp53_alterations_score <- tp53_alterations %>%
# add classifier score
full_join(classifier_score,by=c("sample_id",
"composition",
"cell_line_composition",
"cancer_predispositions",
"sample_type",
"Kids_First_Participant_ID",
"cohort")) %>%
# select useful columns
dplyr::select(sample_id,Kids_First_Biospecimen_ID_RNA,Kids_First_Biospecimen_ID_DNA,
tp53_score,cancer_predispositions,HGVSp_Short,copy_number,SV_type,FusionName,hotspot,activating) %>%
arrange(sample_id) %>%
unique() %>%
replace_na(list(hotspot = 0, activating = 0))
Warning in full_join(., classifier_score, by = c("sample_id", "composition", : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 179 of `x` matches multiple rows in `y`.
ℹ Row 3085 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
tp53_alterations_score
Convert the above to wide format
tp53_alterations_score_wide <- tp53_alterations_score %>%
# group
group_by(sample_id,
Kids_First_Biospecimen_ID_DNA,
Kids_First_Biospecimen_ID_RNA,
cancer_predispositions,
tp53_score) %>%
#
# sample_id as rows add SNV counts,CNV loss counts
# and hotspot/activating mutation annotation
summarise(
# summarize length of SNV and CNV alterations
SNV_indel_counts= length(unique(HGVSp_Short[!is.na(HGVSp_Short)])),
CNV_loss_counts = length(unique(copy_number[!is.na(copy_number)])),
SV_counts = length(unique(SV_type[!is.na(SV_type)])),
Fusion_counts = length(unique(FusionName[!is.na(FusionName)])),
HGVSp_Short = toString(unique(HGVSp_Short)),
CNV_loss_evidence = toString(unique(copy_number)),
SV_type = toString(unique(SV_type)),
Fusion_evidence = toString(unique(FusionName)),
# summarize unique hotspot values per sample_id
hotspot = max(unique(hotspot[!is.na(hotspot) ])),
activating = max(unique(activating[!is.na(activating)])))
`summarise()` has grouped output by 'sample_id', 'Kids_First_Biospecimen_ID_DNA', 'Kids_First_Biospecimen_ID_RNA',
'cancer_predispositions'. You can override using the `.groups` argument.
tp53_alterations_score_wide
As discussed above we want to annotate TP53 mutants that are putative loss-of-function OR gain-of-function.
tp53_alterations_score_wide <- tp53_alterations_score_wide %>%
mutate(
# add tp53_altered annotation column
tp53_altered =
case_when(
# when activating == 0
activating == 0 &
# check if mutated variant AA position overlaps hotspot database
( hotspot == 1 |
# check if sample_id has SNV+(CNV|SV) mutation
# suggesting both alleles are mutated
(SNV_indel_counts >= 1 &
(CNV_loss_counts >=1 | SV_counts >=1) ) |
# check if more than 1 SNV is present
# suggesting both alleles are mutated
SNV_indel_counts > 1 |
# check if SNV mutant and has
# Li-Fraumeni syndrome suggesting
# germline TP53 mutant as cancer predisposition
(SNV_indel_counts >= 1 &
grepl("Li-Fraumeni syndrome",cancer_predispositions)) |
# check if CNV|SV mutant and has
# Li-Fraumeni syndrome suggesting
# germline TP53 mutant as cancer predisposition
((CNV_loss_counts >= 1 | SV_counts >=1 ) &
grepl("Li-Fraumeni syndrome",cancer_predispositions)) |
# check if tp53 inactivating score for RNA_Seq is greater that 0.5
# and has Li-Fraumeni syndrome suggesting
# germline TP53 mutant as cancer predisposition
(tp53_score > 0.5 &
grepl("Li-Fraumeni syndrome",cancer_predispositions)) |
# check if tp53 inactivating score for RNA_Seq is greater that 0.5
# and has 1 or more SNV| (CNV|SV) loss in TP53
(tp53_score > 0.5 &
(SNV_indel_counts >= 1 | (CNV_loss_counts >=1 | SV_counts >=1 )))
) ~ "loss",
# when activating == 1
activating == 1 ~ "activated",
# if no evidence supports TP53 inativation or activation
TRUE ~ "Other"
)
)
ggplot(tp53_alterations_score_wide, aes(x = factor(tp53_altered), y = tp53_score)) +
geom_violin()+
geom_jitter(alpha = 0.5, width = 0.2) +
stat_compare_means() +
theme_bw() +
ggtitle("Distribution of scores across tp53 altered status") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))+
xlab("tp53 altered status")
Warning: Removed 3255 rows containing non-finite values (`stat_ydensity()`).
Warning: Removed 3255 rows containing non-finite values
(`stat_compare_means()`).
Warning: Removed 3255 rows containing missing values (`geom_point()`).
TP53 mutants with gain-of-function (activating) mutants promote tumorigenesis according to literature. Reference and reference have similar distribution of classifier scores to that of potential loss-of-function (multi-allelic) Tp53 mutations.
“Other” annotated samples either don’t have any high-confidence loss/gain TP53 SNVs nor CNV losses OR DNA sample is not available.
expr_combined <- readRDS(file.path(data_dir,"gene-counts-rsem-expected_count-collapsed.rds"))
# iterate through all possible RNA library type
# first filter to
clinical_filtered <- clinical %>%
filter(Kids_First_Biospecimen_ID %in% names(expr_combined))
# generate list of all RNA library type
rna_library_list <- clinical_filtered %>%
filter(Kids_First_Biospecimen_ID %in% names(expr_combined)) %>%
pull(RNA_library) %>%
unique()
for(i in 1:length(rna_library_list)) {
library_each <- rna_library_list[[i]]
# pull BS IDs for that RNA library type
library_samples <- clinical_filtered %>%
filter(RNA_library == library_each) %>%
pull(Kids_First_Biospecimen_ID) %>%
unique()
# subset expression matrix using that
expr_each <- expr_combined %>%
dplyr::select(library_samples)
# subset to TP53
subset_expr_each <- t(expr_each)[,"TP53"] %>%
as.data.frame()
colnames(subset_expr_each) <- "TP53"
# combine the TP53 alteration information
subset_expr_combined <- subset_expr_each %>%
tibble::rownames_to_column() %>%
left_join(tp53_alterations_score_wide,by=c("rowname"="Kids_First_Biospecimen_ID_RNA")) %>%
filter(tp53_altered %in% c("activated","loss"))
# plot distribution of TP53 gene expression
plot<- ggplot(subset_expr_combined, aes(x = factor(tp53_altered), y = TP53)) +
geom_violin()+
geom_jitter(alpha = 0.5, width = 0.2) +
stat_compare_means() +
theme_bw() +
ggtitle("Distribution of TP53 expression across tp53 altered status") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))+
xlab(paste0("tp53 altered status wth RNA library ", library_each))
print(plot)
}
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(library_samples)
# Now:
data %>% select(all_of(library_samples))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
ggplot(tp53_alterations_score_wide, aes(x = factor(tp53_altered), y = tp53_score)) +
geom_violin()+
geom_jitter(alpha = 0.5, width = 0.2) +
theme_bw() +
ggtitle("Distribution of scores across tp53 altered status") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))+
xlab("tp53 altered status") +
facet_wrap(.~cancer_predispositions)
Warning: Removed 3255 rows containing non-finite values (`stat_ydensity()`).
Warning: Groups with fewer than two data points have been dropped.
Warning in max(data$density): no non-missing arguments to max; returning -Inf
Warning: Computation failed in `stat_ydensity()`
Caused by error in `$<-.data.frame`:
! replacement has 1 row, data has 0
Warning: Groups with fewer than two data points have been dropped.
Warning in max(data$density): no non-missing arguments to max; returning -Inf
Warning: Computation failed in `stat_ydensity()`
Caused by error in `$<-.data.frame`:
! replacement has 1 row, data has 0
Warning: Removed 3255 rows containing missing values (`geom_point()`).
Some NF-1 and/or Other inherited conditions NOS have high scoring TP53 mutants as well.
Interesting 2 bs ids annotated with Li-Fraumeni synfrome pre-disposition have very low tp53 classifier scores.
tp53_alterations_score_wide %>%
filter(grepl("Li-Fraumeni syndrome", cancer_predispositions),
tp53_score < 0.5)
Adding DNA and RNA biospecimen ids to
tp53_alterations_score_wide and saving
tp53_alterations_score_wide %>%
write_tsv(file.path(results_dir,"tp53_altered_status.tsv"))